capture log close
log using "$logs/cr-covariates-v3_0.log", replace

	* Build the covariates that can be derived from the SEDA v3.0 data

/******************************************************************************
	Step 1: show that the old (v2.1) and new (v3.0) county/school crosswalks
	are very similar (even if the district crosswalks are not)
*******************************************************************************/

	* ---- v3.0 crosswalk, reduced to one county per original school id ----
	use "$raw/seda/seda_crosswalk_v30.dta", clear
	keep ncessch_orig ncessch countyid16 countyid year has_geo leaidC
	* Bring in pooled school enrollment; assert(1 3) stops the run if the
	* covariate file holds a school that is absent from the crosswalk.
	merge m:1 ncessch using "$raw/seda/SEDA_cov_school_pool_v30.dta", ///
		keepusing(totenrl) assert(1 3) nogenerate

		* Rows without a 2016 county: list their leaidC, then discard them
		tabulate leaidC if missing(countyid16) // Missing geo/virtual
		drop if missing(countyid16)

	* Number of distinct countyid values observed per original school id
	* (egen nvals() is a user-written egen function)
	bysort ncessch_orig: egen nvals = nvals(countyid)
	egen tag = tag(ncessch)
	tabulate nvals if tag, missing // 99% - take most recent

	* Sort newest year first so (firstnm) keeps the first non-missing
	* countyid in that order for each school
	gsort ncessch_orig -year
	collapse (firstnm) countyid (mean) totenrl, by(ncessch_orig)
	isid ncessch_orig
	rename ncessch_orig ncessch

	* ---- v2.1 crosswalk, reduced the same way ----
	preserve
		use "$raw/seda/seda_crosswalk_v21.dta", clear
		keep ncessch countyid year
		bysort ncessch: egen nvals = nvals(countyid)
		egen tag = tag(ncessch)
		tabulate nvals if tag, missing // 99% - take most recent
		gsort ncessch -year
		collapse (firstnm) countyid, by(ncessch)
		rename countyid countyid21
		tempfile xwalk21
		save `xwalk21'
	restore

	* ---- Compare the two county assignments school by school ----
	merge 1:1 ncessch using `xwalk21'
	generate same = countyid21==countyid
	tabulate same // 98%
	tabulate same if _merge==3 // almost 100%
	bysort same: summarize totenrl // Smaller schools. No HUGE school.

	// Conclusion: the crosswalks are very similar, so use v30, and fall back
	// on v21 for covariates that are missing.
	
/******************************************************************************
	seda covariates
*******************************************************************************/

// Start from the pooled v3.0 school-level covariates
use "$raw/seda/SEDA_cov_school_pool_v30.dta", clear
keep ncessch schnam perfl charter totenrl perblk perwht perhsp

// Get the county: build a one-row-per-school lookup from the v3.0 crosswalk.
// The county variable is referred to by its full name (countyid16) rather than
// the abbreviation "countyid", so this section does not depend on Stata's
// variable-name abbreviation being switched on.
preserve
	use "$raw/seda/seda_crosswalk_v30.dta", clear
	keep ncessch countyid16 // Assign 2016 countyid as SEDA does
	// Should be NYC; the only school with more than one county here.
	replace countyid16 = "36061" if ncessch=="362058006062"
	duplicates drop
	// Guard: after the manual fix each school has at most one distinct county
	// (nvals is expected to be missing when the county itself is missing).
	bys ncessch: egen nvals = nvals(countyid16)
	assert nvals==1 | nvals==.
	collapse (firstnm) countyid16, by(ncessch)
	isid ncessch
	rename countyid16 countyid
	tempfile xwalk
	save `xwalk'
restore
// assert(2 3): every covariate school must be in the crosswalk;
// keep(3): crosswalk-only schools are discarded.
merge 1:1 ncessch using `xwalk', assert(2 3) keep(3) nogen
	
// Look into the schools that carry no county information
preserve
	// Crosswalk rows flagged as having no geography, one row per school/leaidC
	use "$raw/seda/seda_crosswalk_v30.dta", clear
	keep if has_geo == 0
	keep ncessch leaidC
	duplicates drop
	tempfile nogeo
	save `nogeo'
restore
// 1:m because a school can appear under more than one leaidC in the lookup
merge 1:m ncessch using `nogeo', keep(master match)
tabulate leaidC if missing(countyid), missing // nearly all are Virtual schools. Just a few have no geographic info.
// Schools without a countyid cannot be linked to county-level data (<1%)
drop if missing(countyid)
drop leaidC _merge
// Collapse back to one row per school after the 1:m merge
duplicates drop
isid ncessch
// countyid is a string up to this point; make it numeric for later merges
destring countyid, replace

// Attach the county-level SEDA covariates to every school row.
// assert(1 3) stops the run if the county file has a county with no school.
merge m:1 countyid using "$raw/seda/SEDA_cov_county_pool_v30.dta", ///
	assert(1 3) generate(_m_county) ///
	keepusing(fips hswhtblk hsflnfl sesavgall sesavgwhtblk)
mdesc // 31 of 105K schools missing charter flag
	// When MC investigated them in the CCD, those that showed up
	// were NOT charter schools.
	// Most schools are not charter, so assign not charter.
	// Most of these are small schools.
replace charter = 0 if missing(charter)

// School-level head counts by race: share times total enrollment.
// The school-level inputs are first prefixed with sch_ so the unprefixed
// names can be reused for the county-level versions below.
renvars totenrl perwht perhsp perblk perfl, prefix(sch_)
foreach grp in wht blk hsp {
	generate sch_`grp' = sch_per`grp'*sch_totenrl
}

// County totals and county-level racial shares (constant within countyid)
bysort countyid: egen totenrl = total(sch_totenrl)
foreach grp in wht blk hsp {
	bysort countyid: egen `grp' = total(sch_`grp')
	generate per`grp' = `grp'/totenrl
}
	
// Free-lunch exposure by race: the county mean of school-level FL%, weighted
// by the number of white (resp. black) students in each school,
// as described in the v 2.1 covariate codebook.
foreach grp in wht blk {
	preserve
		collapse (mean) flunch_`grp' = sch_perfl [aw=sch_`grp'], by(countyid)
		tempfile flshare
		save `flshare'
	restore
	merge m:1 countyid using `flshare', nogenerate
}

// One tagged row per county, so summaries are not weighted by school count
egen tag = tag(countyid)
generate flunchwhtblk = flunch_wht-flunch_blk
summarize flunch_wht flunch_blk flunchwhtblk if tag
// Only the white-black difference is carried forward
drop flunch_wht flunch_blk

// Create per charter variables (all constant within countyid):
//   percharter_all   = charter enrollment / total county enrollment
//   percharter_blk   = black charter enrollment / black county enrollment
//   percharter_wht   = white charter enrollment / white county enrollment
//   percharterwhtblk = percharter_wht - percharter_blk
//
// FIX: the race-specific shares used to be divided by total county enrollment
// (totenrl). That made the white-black "gap" mostly a function of the county's
// racial composition instead of the difference in charter-attendance rates.
// Each group is now divided by its own county total (wht / blk, built above).
// NOTE(review): a county with zero students in a group gets a missing share
// (division by zero) and therefore a missing gap - confirm this is acceptable
// for the complete-case filter applied later.

// Overall share: students in charter schools over all students
generate temp = sch_totenrl if charter
bysort countyid: egen percharter_all = total(temp)
	replace percharter_all = percharter_all/totenrl
drop temp

// Group-specific shares: group's charter students over the group's students
foreach grp in blk wht {
	generate temp = sch_`grp' if charter
	bysort countyid: egen percharter_`grp' = total(temp)
		replace percharter_`grp' = percharter_`grp'/`grp'
	drop temp
}

generate percharterwhtblk = percharter_wht-percharter_blk
summarize percharter_all percharterwhtblk if tag
// Only the overall share and the white-black difference are carried forward
drop percharter_wht percharter_blk

// Reduce to one row per county: every retained variable is constant within
// countyid, so dropping duplicate rows leaves a county-level file
// (isid stops the run if that is not the case).
local countyvars perblk perhsp flunchwhtblk percharter_all percharterwhtblk ///
	hswhtblk hsflnfl sesavgall sesavgwhtblk
keep countyid `countyvars'
duplicates drop
isid countyid

// Park the v3.0-based county covariates for the merge with the v2.1 ones
tempfile v3
save `v3'

// Use 2.1 covariates for measures that cannot be rebuilt from the v3.0 files

	// District level
	// FIX: the white-black student/teacher ratio (ratstutch_whtblk) is used
	// here. The white-Hispanic ratio (ratstutch_whthsp) was previously loaded
	// and then renamed stutchwhtblk, mislabelling it as a white-black measure.
	// NOTE(review): assumes the v2.1 geodist file carries ratstutch_whtblk;
	// the keep below fails loudly if it does not.
	use "$raw/seda/SEDA_cov_geodist_pool_v21", clear	
	keep leaid ppexp_inst stutch_all totenrl ratstutch_whtblk countyid
	isid leaid
	keep if !mi(countyid) // ~5%
	
		// Weight covariates by enrollment to the county level:
		// for each covariate, replace the district value by the
		// enrollment-weighted county mean (one collapse per variable).
		foreach var in ppexp_inst stutch_all ratstutch_whtblk {
		preserve
			collapse (mean) `var' [aw=totenrl], by(countyid)
			tempfile temp
			save `temp'
		restore
		drop `var'
		merge m:1 countyid using `temp', nogen
		}
		rename ratstutch_whtblk stutchwhtblk
		// Rescale per-pupil instructional expenditure (divide by 10,000)
		replace ppexp_inst = ppexp_inst/10000
	
	// County-level values are constant within countyid: keep one row per county
	keep countyid ppexp_inst stutch_all stutchwhtblk
	duplicates drop
	isid countyid
	// Match the numeric countyid used in the v3.0-based file
	destring countyid, replace
	
// Combine with the v3.0-based county covariates and keep complete cases only
merge 1:1 countyid using `v3', nogen
mdesc // Primarily missing the gap for SES wht-blk
egen rm = rowmiss(*)
keep if rm == 0
drop rm

// Pairwise correlations across all covariates (relies on variable order)
pwcorr ppexp_inst-percharterwhtblk
rename sesavgwhtblk seswhtblk
rename sesavgall sesall

save "$data/seda_covariates_clean.dta", replace
